/**********************************************************************
gc_4_firm_info.do

**********************************************************************/
**********
* SET UP *
**********
* Start from an empty session before anything else is set up.
clear all
set matsize 2000
* NOTE(review): set more 1 is presumably the legacy spelling of set more off;
* confirm on the Stata version in use.
set more 1

* location for dofiles *
* The working directory is stored in the global dir. The macro is quoted when
* it is handed back to cd so that a directory name with spaces still works.
cd "T:\_Projet_4915\dofiles"
global dir "`c(pwd)'"
cd "$dir"

*********
* GATES *
*********
* Specify which data you want to work with (synthetic = syn, real = rl) *
* The value of ext is appended to the log name and to every dataset name below.
local ext = "rl"

* Each gate is compared with 1 by the matching STEP block further down;
* any other value makes that block skip its body.
* STEP 1: prepare dataset for analysis *
local gate1 = 1

* STEP 2: Firm characteristics, descriptives *
local gate2 = 1

* STEP 3: delete all intermediate datasets *
local gate3 = 1

* start log file *
* Any log that is still open is closed first; capture suppresses the error
* raised when no log is open. The new log is plain text and overwrites an
* existing file of the same name.
quietly capture log close
quietly log using gc_4_firm_info_`ext', text replace

* specify file locations *
* Paths are stored without the drive letter; T: is prepended where they are used.
* Only data_folder is referenced in this do-file; the other three globals are
* defined here but not used below.
global project_folder "\_Projet_4915"
global data_folder "\_Projet_4915\DATA"
global output_folder "\_Projet_4915\ResultsFolder"
global temp "temp"

* Base directory for all datasets, with a trailing backslash so that file
* names can be appended directly.
* NOTE(review): Stata treats a backslash placed directly before a macro
* reference as an escape, so data_folder is presumably expanded only when
* datadir itself is later expanded. Forward slashes would remove the
* ambiguity; confirm before changing.
local datadir T:\${data_folder}\

*******************************************
* MAKE A GLOBAL VARIABLE FOR TODAY'S DATE *
*******************************************
* S_DATE holds the current date as text in day, month-name, year order.
* It is converted to a Stata daily date once, and the td display format then
* yields the zero-padded YYYYMMDD string directly. This replaces the manual
* padding of day and month; the resulting value of the global is the same.
local today = date(trim("$S_DATE"), "DMY")

* put final date together *
global date : display %tdCCYYNNDD `today'

****************************************
* DEFINE LITTLE PROGRAMS TO PRINT TIME *
****************************************
program starttime
	* Print the clock time and date at which a processing step begins.
	local msg "Started processing at $S_TIME on $S_DATE"
	display "`msg'"
end

program endtime
	* Print the clock time and date at which a processing step ends.
	local msg "Finished processing at $S_TIME on $S_DATE"
	display "`msg'"
end

************************
* START OF THE PROGRAM *
************************

****************************************
* STEP 1: prepare dataset for analysis *
****************************************
disp "***** Started processing STEP 1 *****"
starttime
if `gate1' == 1 {
	disp "***** STEP 1: prepare dataset for analysis *****"
	local datadir T:\${data_folder}\

	* Load the firm-year panel and order it by firm and tax year *
	use "`datadir'gc_firm_panel_`ext'.dta", clear
	sort eid_long tax_yr

	* Firm-years without a NAICS code are removed *
	drop if missing(naics4_int)

	* 2- and 3-digit industry codes are the leading characters of naics4,
	* kept as strings and as numeric copies *
	gen naics2 = substr(naics4,1,2)
	destring naics2, gen(naics2_int)
	gen naics3 = substr(naics4,1,3)
	destring naics3, gen(naics3_int)

	* Broad industry groups. Sectors that receive no group are excluded:
	*   11 agriculture, forestry, fishing and hunting
	*   21, 22 natural resource industries and utilities
	*   61 educational services
	*   62 health care and social assistance
	*   91 public administration
	gen naics = ""
	* 2 = construction *
	replace naics = "2" if naics2 == "23"
	* 3 = manufacturing *
	replace naics = "3" if inlist(naics2, "31", "32", "33")
	* 4 = wholesalers and distributors *
	replace naics = "4" if naics2 == "41"
	* 5 = sales *
	replace naics = "5" if inlist(naics2, "44", "45")
	* 6 = FIRE *
	replace naics = "6" if inlist(naics2, "52", "53")
	* 7 = professionals *
	replace naics = "7" if naics2 == "54"
	* 8 = other services *
	replace naics = "8" if inlist(naics2, "48", "49")
	replace naics = "8" if inlist(naics2, "51", "55", "56")
	replace naics = "8" if inlist(naics2, "71", "72")
	replace naics = "8" if naics2 == "81"

	* Keep only firm-years that received an industry group *
	drop if naics == ""

	* Report the number of firms: one flagged row per eid_long *
	by eid_long: gen byte is_first = (_n == 1)
	count if is_first
	display "There are now `r(N)' firms in the dataset"
	drop is_first

	* Debugging aid, switched off by the always-false condition: keeps a
	* random subset of 10000 firms to speed up test runs *
	if 1 == 2 {
		by eid_long: gen double ui = floor(100000*10000*runiform()+1) if _n==1
		by eid_long: replace ui = ui[1]
		bys ui: gen uii = _n==1
		replace uii = sum(uii)
		drop if uii > 10000
		drop uii ui
		sort eid_long tax_yr

		by eid_long: gen byte is_first = (_n == 1)
		count if is_first
		display "There are now `r(N)' firms in the dataset"
		drop is_first
	}

	* Employment: first source, then t4_ilu where the first is missing or
	* below 1; values below 1 are set to missing at each stage *
	gen employment = pd7_avgemp_nonzero
	replace employment = . if employment < 1
	replace employment = t4_ilu if missing(employment)
	replace employment = . if employment < 1

	* Revenue: same two-source pattern, values of 1 or less set to missing *
	gen revenue = sales_goods_and_services
	replace revenue = . if revenue <= 1
	replace revenue = total_revenue if missing(revenue)
	replace revenue = . if revenue <= 1

	* Revenue per employee *
	gen rev_perL = revenue/employment

	* Payroll: same two-source pattern, values of 1 or less set to missing *
	gen payroll = pd7_totalpayroll
	replace payroll = . if payroll <= 1
	replace payroll = t4_payroll if missing(payroll)
	replace payroll = . if payroll <= 1

	* Payroll per employee *
	gen payroll_perL = payroll/employment

	* Firm-year file read back by STEP 2; written before the drops below *
	save "`datadir'gc_firm_tmp_`ext'.dta", replace

	* Drop firm-year observations with no employment, payroll or revenue
	* info, reporting the remaining number of firms after each drop *
	foreach required in employment payroll revenue {
		drop if `required' == .
		by eid_long: gen byte is_first = (_n == 1)
		count if is_first
		display "There are now `r(N)' firms in the dataset"
		drop is_first
	}

	* Residuals of each log outcome: e1 removes tax-year effects, e2 removes
	* tax-year by 2-digit-industry effects *
	sort eid_long tax_yr
	foreach var in employment rev_perL payroll_perL {
		gen ln_`var' = ln(`var')
		reg ln_`var' i.tax_yr
		predict e1_`var', residuals
		reg ln_`var' i.tax_yr#i.naics2_int
		predict e2_`var', residuals
	}

	* Firm-level means of the level, the log and both residuals *
	foreach var in employment rev_perL payroll_perL {
		foreach stem in `var' ln_`var' e1_`var' e2_`var' {
			by eid_long: egen avg_`stem' = mean(`stem')
		}
	}
	by eid_long: egen tot_emp = total(employment)

	* Collapse to the first row of every firm *
	bys eid_long: gen byte is_first = (_n == 1)
	keep if is_first
	drop is_first

	keep eid_long naics* avg* tot_emp
	order eid_long naics* avg* tot_emp

	* Firm categories, weighted by average firm employment: every firm row is
	* replicated avg_emp_int times so that the percentiles below are
	* employment-weighted. tot_emp is rounded only for the alternative
	* weighting that is commented out *
	gen avg_emp_int = round(avg_employment,1)
	replace tot_emp = round(tot_emp,1)
	expand avg_emp_int
	*expand tot_emp
	drop tot_emp avg_emp_int

	* For each outcome and each residual type r, build a 4-class variable
	* from the 25/50/75 cut points and a 3-class variable from the 33/66 cut
	* points. Class 1 is the top group *
	foreach var in employment rev_perL payroll_perL {
		foreach r in 1 2 {
			local res avg_e`r'_`var'

			foreach q in 25 33 50 66 75 {
				egen p_`q'pctile = pctile(`res'), p(`q')
			}

			gen f`r'_`var'_4c = .
			replace f`r'_`var'_4c = 1 if `res' > p_75pctile & `res' != .
			replace f`r'_`var'_4c = 2 if `res' > p_50pctile & `res' <= p_75pctile
			replace f`r'_`var'_4c = 3 if `res' > p_25pctile & `res' <= p_50pctile
			replace f`r'_`var'_4c = 4 if `res' <= p_25pctile
			label var f`r'_`var'_4c "Firm class, avg ln `var', res `r', (4 cats)"

			gen f`r'_`var'_3c = .
			replace f`r'_`var'_3c = 1 if `res' > p_66pctile & `res' != .
			replace f`r'_`var'_3c = 2 if `res' > p_33pctile & `res' <= p_66pctile
			replace f`r'_`var'_3c = 3 if `res' <= p_33pctile
			label var f`r'_`var'_3c "Firm class, avg ln `var', res `r', (3 cats)"

			drop p_*
		}
	}
	sort eid_long

	* Undo the replication: back to one row per firm *
	bys eid_long: gen byte is_first = (_n == 1)
	keep if is_first
	count if is_first
	display "There are now `r(N)' firms in the dataset"
	drop is_first

	sum *

	* Save the firm-level file to the data_folder *
	local datadir T:\${data_folder}\
	compress
	save "`datadir'gc_firm_info_`ext'.dta", replace

	disp "***** STEP 1: prepare dataset for analysis (COMPLETED) *****"
}
disp "***** Finished processing STEP 1 *****"
endtime

**********************************************
* STEP 2: Firm characteristics, descriptives *
**********************************************
disp "***** Started processing STEP 2 *****"
starttime
if `gate2' == 1 {
	disp "***** STEP 2: Firm characteristics, descriptives *****"

	* Firm-year file written by STEP 1 *
	use "`datadir'gc_firm_tmp_`ext'.dta", clear
	sort eid_long tax_yr

	* Attach the firm-level classes; only firms found in both files stay *
	merge m:1 eid_long using "`datadir'gc_firm_info_`ext'.dta"
	keep if _merge == 3
	drop _merge

	* Attach the firm effects; rows found only in the using file are dropped *
	merge m:1 eid_long using "`datadir'gc_akm_v1_men_firm_fe_`ext'.dta"
	drop if _merge == 2
	drop _merge

	* Rounded employment, used as the frequency weight below *
	gen emp_int = round(employment,1.0)

	* Three firm classes from the 4-class payroll-per-employee variable:
	* classes 1 and 2 keep their number, 3 and 4 are pooled into 3, and
	* anything else stays 0 *
	gen firm_class = 0
	forvalues c = 1/2 {
		replace firm_class = `c' if f2_payroll_perL_4c == `c'
	}
	replace firm_class = 3 if inlist(f2_payroll_perL_4c, 3, 4)

	* The same report is produced twice: first for all years, then for tax
	* year 2012 only. Each pass saves the sample and prints unweighted and
	* employment-weighted summaries, overall and by firm class *
	local esamples T:\${data_folder}\gc_esamples\
	local statvars employment rev_perL payroll_perL firm_fe
	foreach sample in all 2012 {
		local suffix
		if "`sample'" == "2012" {
			keep if tax_yr == 2012
			local suffix _2012
		}

		display "...Now reporting summary statistics at the firm-year level..."
		saveold "`esamples'gc_ss_firms`suffix'.dta", replace v(12)

		display "...Unweighted..."
		summarize `statvars'

		display "...Weighted by employment..."
		summarize `statvars' [fweight = emp_int]

		foreach fcat in "firm_class" {
			display "...Now reporting summary statistics at the firm-year level by `fcat'..."

			display "...Unweighted..."
			bysort `fcat': summarize `statvars'

			display "...Weighted by employment..."
			bysort `fcat': summarize `statvars' [fweight = emp_int]
		}
	}

	disp "***** STEP 2: Firm characteristics, descriptives (COMPLETED) *****"
}
disp "***** Finished processing STEP 2 *****"
endtime

********************************************
* STEP 3: delete all intermediate datasets *
********************************************
disp "***** Started processing STEP 3 *****"
starttime
if 1 == `gate3' ///
{
	disp "***** STEP 3: delete all intermediate datasets *****"

	* This step used to announce the deletion without deleting anything.
	* The scratch file removed here is the firm-year extract that STEP 1
	* saves for STEP 2. The firm-level file and the summary-statistics
	* files are outputs and are left in place.
	* NOTE(review): confirm that no other do-file reads the scratch file.
	* capture keeps a rerun from failing when the file is already gone.
	local datadir T:\${data_folder}\
	capture erase "`datadir'gc_firm_tmp_`ext'.dta"
	local rc = _rc
	if `rc' == 0 {
		disp "Deleted gc_firm_tmp_`ext'.dta"
	}
	else {
		disp "gc_firm_tmp_`ext'.dta was not deleted (return code `rc')"
	}

	disp "***** STEP 3: delete all intermediate datasets (COMPLETED)*****"
}
disp "***** Finished processing STEP 3 *****"
endtime

********
* EXIT *
********
* Empty the session, then close the log file opened near the top of this
* do-file. log close raises an error if no log is open.
clear all
log close
